Dependencies

library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.4.0 but the current version is
## 4.4.1; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
## 'SeuratObject' was built with package 'Matrix' 1.7.0 but the current
## version is 1.7.1; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scDblFinder)
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## 
## Attaching package: 'MatrixGenerics'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## 
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following object is masked from 'package:SeuratObject':
## 
##     intersect
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## 
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:utils':
## 
##     findMatches
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:sp':
## 
##     %over%
## 
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## 
## Attaching package: 'SummarizedExperiment'
## 
## The following object is masked from 'package:Seurat':
## 
##     Assays
## 
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
set.seed(123)

Import data GSE217846

Publication: Integrative analysis of spatial and single-cell transcriptome data from human pancreatic cancer reveals an intermediate cancer cell population associated with poor prognosis

(https://pmc.ncbi.nlm.nih.gov/articles/PMC10832111/)

dirs <-  list.dirs(path = './input/', recursive = F, full.names = F)


for (x in dirs){
  
 cts <- ReadMtx(mtx= paste0('./input/', x, '/matrix.mtx.gz'),
                             features = paste0('./input/', x, '/features.tsv.gz'),
                             cells=paste0('./input/', x, '/barcodes.tsv.gz'))
  
 assign(x, CreateSeuratObject(counts = cts, min.cells = 3, project = x))
 
}


# merge object

seurat_object <- merge(GSM5831620_5_GEX_4, y=c(GSM5831621_5_GEX_5,GSM5831622_5_GEX_6, GSM5831623_5_GEX_9,GSM5831624_GEX_45_MM),
      add.cell.ids=c(ls()[3:7]))

Preprocessing

# create columns for metadata

# identity
unique(seurat_object@meta.data$orig.ident) # control identity

[1] “GSM5831620_5_GEX_4” “GSM5831621_5_GEX_5” “GSM5831622_5_GEX_6”
[4] “GSM5831623_5_GEX_9” “GSM5831624_GEX_45_MM”

# cell id
seurat_object@meta.data$cell_id <- rownames(seurat_object@meta.data)

# percent mt
seurat_object[["percent.mt"]]<-PercentageFeatureSet(seurat_object, pattern="^MT-")

plot1 <- FeatureScatter(seurat_object, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(seurat_object, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

# parameters from the publication
seurat_object <- subset(seurat_object, subset = percent.mt < 10 & nCount_RNA > 2000 & nFeature_RNA > 500 & nFeature_RNA < 7000)


head(seurat_object)
orig.ident nCount_RNA nFeature_RNA cell_id percent.mt
GSM5831620_5_GEX_4_AAACCTGAGCAACGGT-1 GSM5831620_5_GEX_4 15275 3787 GSM5831620_5_GEX_4_AAACCTGAGCAACGGT-1 1.8788871
GSM5831620_5_GEX_4_AAACCTGAGCACCGTC-1 GSM5831620_5_GEX_4 2462 1522 GSM5831620_5_GEX_4_AAACCTGAGCACCGTC-1 3.8180341
GSM5831620_5_GEX_4_AAACCTGAGTCAAGGC-1 GSM5831620_5_GEX_4 2170 985 GSM5831620_5_GEX_4_AAACCTGAGTCAAGGC-1 1.7972350
GSM5831620_5_GEX_4_AAACCTGAGTCTCGGC-1 GSM5831620_5_GEX_4 2757 1239 GSM5831620_5_GEX_4_AAACCTGAGTCTCGGC-1 7.1091766
GSM5831620_5_GEX_4_AAACCTGCAAAGAATC-1 GSM5831620_5_GEX_4 17419 4277 GSM5831620_5_GEX_4_AAACCTGCAAAGAATC-1 0.7922384
GSM5831620_5_GEX_4_AAACCTGCAAGAGTCG-1 GSM5831620_5_GEX_4 11342 3304 GSM5831620_5_GEX_4_AAACCTGCAAGAGTCG-1 0.8993123
GSM5831620_5_GEX_4_AAACCTGCAATCTGCA-1 GSM5831620_5_GEX_4 9774 2783 GSM5831620_5_GEX_4_AAACCTGCAATCTGCA-1 2.0360139
GSM5831620_5_GEX_4_AAACCTGCAATTCCTT-1 GSM5831620_5_GEX_4 21284 4344 GSM5831620_5_GEX_4_AAACCTGCAATTCCTT-1 0.0845706
GSM5831620_5_GEX_4_AAACCTGCAGTATAAG-1 GSM5831620_5_GEX_4 26405 4383 GSM5831620_5_GEX_4_AAACCTGCAGTATAAG-1 2.7797766
GSM5831620_5_GEX_4_AAACCTGCATAGACTC-1 GSM5831620_5_GEX_4 4595 1577 GSM5831620_5_GEX_4_AAACCTGCATAGACTC-1 9.4885745
seurat_object
## An object of class Seurat 
## 24192 features across 32681 samples within 1 assay 
## Active assay: RNA (24192 features, 0 variable features)
##  5 layers present: counts.GSM5831620_5_GEX_4, counts.GSM5831621_5_GEX_5, counts.GSM5831622_5_GEX_6, counts.GSM5831623_5_GEX_9, counts.GSM5831624_GEX_45_MM
plot1 <- FeatureScatter(seurat_object, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(seurat_object, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

# double check filtering
min(seurat_object@meta.data$nFeature_RNA)
## [1] 505
max(seurat_object@meta.data$nFeature_RNA)
## [1] 6991
min(seurat_object@meta.data$nCount_RNA)
## [1] 2002
max(seurat_object@meta.data$percent.mt)
## [1] 9.997791

Control for doublets

for (i in c("GSM5831620_5_GEX_4","GSM5831621_5_GEX_5","GSM5831622_5_GEX_6","GSM5831623_5_GEX_9","GSM5831624_GEX_45_MM")){
    sub <- subset(seurat_object, subset = orig.ident == i)
    eval(parse(text=paste("sceDblF_",i," <- scDblFinder(GetAssayData(sub, layer = 'counts'), dbr = 0.07)",sep="")))
    eval(parse(text=paste("score.",i," <- sceDblF_",i,"@colData@listData[['scDblFinder.score']]",sep="")))
    eval(parse(text=paste("names(score.",i,") <- rownames(sceDblF_",i,"@colData)",sep="")))
}
## Assuming the input to be a matrix of counts or expected counts.
## Creating ~7249 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1512 cells excluded from training.
## iter=1, 1559 cells excluded from training.
## iter=2, 1572 cells excluded from training.
## Threshold found:0.499
## 995 (11%) doublets called
## Assuming the input to be a matrix of counts or expected counts.
## Creating ~9278 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1428 cells excluded from training.
## iter=1, 1499 cells excluded from training.
## iter=2, 1496 cells excluded from training.
## Threshold found:0.409
## 1344 (11.6%) doublets called
## Assuming the input to be a matrix of counts or expected counts.
## Creating ~3198 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 639 cells excluded from training.
## iter=1, 667 cells excluded from training.
## iter=2, 668 cells excluded from training.
## Threshold found:0.473
## 355 (8.9%) doublets called
## Assuming the input to be a matrix of counts or expected counts.
## Creating ~4848 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 937 cells excluded from training.
## iter=1, 1005 cells excluded from training.
## iter=2, 1016 cells excluded from training.
## Threshold found:0.539
## 711 (11.7%) doublets called
## Assuming the input to be a matrix of counts or expected counts.
## Creating ~1574 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 299 cells excluded from training.
## iter=1, 291 cells excluded from training.
## iter=2, 294 cells excluded from training.
## Threshold found:0.454
## 96 (4.9%) doublets called
doublets.info <- rbind(sceDblF_GSM5831620_5_GEX_4@colData,sceDblF_GSM5831621_5_GEX_5@colData,sceDblF_GSM5831622_5_GEX_6@colData,
                       sceDblF_GSM5831623_5_GEX_9@colData,sceDblF_GSM5831624_GEX_45_MM@colData)
seurat_object$is.doublet <- doublets.info$scDblFinder.class

seurat_object <- subset(seurat_object, subset = is.doublet == 'singlet')

Seurat workflow

# default parameters
seurat_object<-NormalizeData(seurat_object)
## Normalizing layer: counts.GSM5831620_5_GEX_4
## Normalizing layer: counts.GSM5831621_5_GEX_5
## Normalizing layer: counts.GSM5831622_5_GEX_6
## Normalizing layer: counts.GSM5831623_5_GEX_9
## Normalizing layer: counts.GSM5831624_GEX_45_MM
seurat_object<-FindVariableFeatures(seurat_object, selection.method = "vst", nfeatures=2000)
## Finding variable features for layer counts.GSM5831620_5_GEX_4
## Finding variable features for layer counts.GSM5831621_5_GEX_5
## Finding variable features for layer counts.GSM5831622_5_GEX_6
## Finding variable features for layer counts.GSM5831623_5_GEX_9
## Finding variable features for layer counts.GSM5831624_GEX_45_MM
all.genes<-rownames(seurat_object)
seurat_object<-ScaleData(seurat_object, features=all.genes)
## Centering and scaling data matrix
seurat_object <- RunPCA(seurat_object, features = VariableFeatures(object = seurat_object))
## PC_ 1 
## Positive:  SPINT2, KRT19, KRT8, CLDN4, EPCAM, KRT18, ELF3, FXYD3, S100P, VAMP8 
##     SMIM22, TSPAN1, LSR, SLPI, RAB25, OCIAD2, NQO1, CLDN7, PLPP2, AGR2 
##     SPINT1, EZR, TACSTD2, KRT7, SYNGR2, MLPH, LAMB3, MUC1, TMPRSS4, GPRC5A 
## Negative:  SPARC, BGN, CALD1, IGFBP7, VIM, DCN, COL1A2, SERPINF1, COL3A1, LUM 
##     SERPING1, COL1A1, C1S, THY1, MYL9, AEBP1, MXRA8, TAGLN, SERPINH1, COL6A2 
##     PCOLCE, IGFBP4, ID3, SFRP2, RARRES2, CTSK, MMP2, COL6A3, MGP, FBLN1 
## PC_ 2 
## Positive:  LUM, RARRES2, TPM1, SFRP2, COL1A1, FBLN1, COL1A2, INHBA, THBS2, CTSK 
##     DCN, ISLR, MXRA8, COL10A1, COL3A1, COL8A1, CTHRC1, COL6A3, CCDC80, AEBP1 
##     SERPINF1, VCAN, C1S, CDH11, COL5A1, TPM2, SDC1, LMO7, PALLD, MXRA5 
## Negative:  PLVAP, PECAM1, RAMP2, CLEC14A, CD93, RAMP3, VWF, ESAM, EMCN, CYYR1 
##     CDH5, AQP1, EGFL7, CLDN5, ADGRL4, CD34, FLT1, CXorf36, IL3RA, TIE1 
##     S1PR1, CALCRL, FAM167B, MMRN2, SLCO2A1, TMEM255B, PTPRB, RBP7, GIMAP7, ROBO4 
## PC_ 3 
## Positive:  TGM2, CDA, DHRS9, S100A4, FAM83A, LY6D, SNCG, EMP3, FYB1, S100A2 
##     EREG, MMP28, IGFBP6, FXYD5, SLC16A3, KRT7, RAC2, ANXA1, SYT8, CLIC3 
##     CSTB, LAPTM5, PPL, WFDC3, TYMP, CST4, TXNIP, SLC2A1, CAV1, WFDC2 
## Negative:  TESC, FAM3B, CA2, SMIM24, SPINK1, TSPAN8, CLDN18, AKR7A3, LYZ, MUC13 
##     TM4SF5, LGALS4, VSIG1, CD24, MSMB, IQGAP2, C12orf75, MUC6, PDIA2, REG1A 
##     TFF1, ANXA10, AQP5, HSD17B2, NR0B2, GMDS, MUC5AC, CLU, FOXA3, ARSE 
## PC_ 4 
## Positive:  NDUFA4L2, RGS5, COX4I2, HIGD1B, EFHD1, NOTCH3, PPP1R14A, PTP4A3, GJA4, ADAP2 
##     HEYL, MCAM, MYH11, SOD3, ENPEP, ADIRF, MAP3K7CL, CCDC102B, GPX3, FAM162B 
##     RASL12, CSPG4, CSRP2, PLXDC1, GUCY1B1, SSTR2, TBX2, MAP1B, CDH6, WFDC1 
## Negative:  COL11A1, COL10A1, CTHRC1, DPYSL3, MMP2, THBS2, COL8A1, SFRP2, VCAN, FNDC1 
##     ITGA11, HTRA1, SDC1, INHBA, GJA1, HSPG2, LRRC15, MXRA5, PLXDC2, CD55 
##     GREM1, GJB2, COL12A1, SLC6A6, MFAP5, FBLN2, ANXA2, IGFBP3, MFAP2, C1QTNF3 
## PC_ 5 
## Positive:  PSCA, CP, TXNIP, DHRS9, PRSS22, ZG16B, GSN, LEMD1, LGALS3, CLIC3 
##     TGM2, SMIM5, WFDC2, SPINT1, CYP3A5, MUC16, TMC5, TNFAIP2, PIGR, SPAG4 
##     EGLN3, PRSS8, LPCAT4, SLPI, TCIM, ADAM28, RHOV, MUC4, DUOX2, BCAS1 
## Negative:  UBE2C, MKI67, BIRC5, TPX2, TOP2A, CENPF, NUSAP1, CDC20, CDK1, CCNB2 
##     PTTG1, CDCA3, HMMR, PLK1, CCNB1, GTSE1, CCNA2, ANLN, ZWINT, ASPM 
##     MAD2L1, CKAP2L, NUF2, TK1, PKMYT1, TYMS, UBE2T, PBK, TROAP, CDKN3
seurat_object <- RunUMAP(seurat_object, reduction='pca', dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 09:27:11 UMAP embedding parameters a = 0.9922 b = 1.112
## 09:27:11 Read 29180 rows and found 20 numeric columns
## 09:27:11 Using Annoy for neighbor search, n_neighbors = 30
## 09:27:11 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 09:27:12 Writing NN index file to temp file /var/folders/8b/2_66jk7x0jl1qc4177kp8svm0000gq/T//RtmpjbwV55/filedc9742a7a245
## 09:27:12 Searching Annoy index using 1 thread, search_k = 3000
## 09:27:18 Annoy recall = 100%
## 09:27:20 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 09:27:21 Initializing from normalized Laplacian + noise (using RSpectra)
## 09:27:22 Commencing optimization for 200 epochs, with 1228714 positive edges
## 09:27:30 Optimization finished
seurat_object <- FindNeighbors(seurat_object, reduction = 'pca', dims = 1:50)
## Computing nearest neighbor graph
## Computing SNN
seurat_object <- FindClusters(seurat_object, resolution = c(0.4))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 29180
## Number of edges: 1084133
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9594
## Number of communities: 29
## Elapsed time: 2 seconds
DimPlot(seurat_object, reduction='pca', label = TRUE)

DimPlot(seurat_object, label = TRUE)

p1 <- DimPlot(seurat_object, reduction = 'umap')
p2 <- DimPlot(seurat_object, reduction = 'umap',  group.by = c("orig.ident"))

p1+p2

Integrate samples

seurat_object <- IntegrateLayers(object = seurat_object, method = CCAIntegration, assay = "RNA", orig.reduction = "pca", new.reduction = "integrated.cca",
                                 scale.layer = "scale.data",verbose = FALSE)


seurat_object <- RunUMAP(seurat_object, reduction='integrated.cca', dims = 1:50)
## 09:43:30 UMAP embedding parameters a = 0.9922 b = 1.112
## 09:43:30 Read 29180 rows and found 50 numeric columns
## 09:43:30 Using Annoy for neighbor search, n_neighbors = 30
## 09:43:30 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 09:43:31 Writing NN index file to temp file /var/folders/8b/2_66jk7x0jl1qc4177kp8svm0000gq/T//RtmpjbwV55/filedc975eef0c2
## 09:43:32 Searching Annoy index using 1 thread, search_k = 3000
## 09:43:36 Annoy recall = 100%
## 09:43:37 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 09:43:38 Initializing from normalized Laplacian + noise (using RSpectra)
## 09:43:39 Commencing optimization for 200 epochs, with 1307460 positive edges
## 09:43:47 Optimization finished
seurat_object <- FindNeighbors(seurat_object, reduction = 'integrated.cca', dims = 1:50)
## Computing nearest neighbor graph
## Computing SNN
seurat_object <- FindClusters(seurat_object, resolution = 0.3)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 29180
## Number of edges: 1172342
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9403
## Number of communities: 16
## Elapsed time: 4 seconds
DimPlot(seurat_object, label = TRUE)

p1 <- DimPlot(seurat_object, reduction = 'umap', label = T)
p2 <- DimPlot(seurat_object, reduction = 'umap',  group.by = c("orig.ident"))

p1+p2

Identify cells of interest

# fibroblast/CAF markers/endothelial cells
VlnPlot(seurat_object, features = toupper(c("Col1a1", "Col1a2", "Col3a1",
                                    "Ngfr", "Pdgfra", "Acta2", "Cd74", "Plvap")))

# tumor markers
VlnPlot(seurat_object, features = toupper(c("Epcam",  "Krt8", "Krt7", "Krt19",  "Top2a", "Gata6", "Hmga2")))

#acinar/ductal/ADM
VlnPlot(seurat_object, features = toupper(c("Krt19", "Cftr",  "Cpb1", "Cpa1", "Cela2a", "Sox9", "Mcam", "Spp1", "Reg3a", "Crp", "Plvap")))
## Warning: Removed 3642 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 3642 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 3642 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 3642 rows containing missing values or values outside the scale range
## (`geom_point()`).

# fibroblasts/ CAF markers/ endothelial cells
FeaturePlot(seurat_object, features = toupper(c("Col1a1", "Col1a2", "Col3a1",
                                    "Ngfr", "Pdgfra", "Acta2", "Cd74", "Plvap")))

#acinar/ductal/ADM
FeaturePlot(seurat_object, features = toupper(c("Krt19", "Cftr",  "Cpb1", "Cpa1", "Cela2a", "Sox9", "Mcam", "Spp1", "Reg3a", "Crp", "Plvap")))

# tumor
FeaturePlot(seurat_object, features = toupper(c("Epcam",  "Krt8", "Krt7", "Krt19",  "Top2a", "Hmga2", "Gata6")))

# marker's from the paper
Ep <- c("EPCAM","ZBED6", "VGLL1", "TRIM54", "PIFO", "MSMB", "KRT6A", "FXYD2", "CDK1")
Fb <- c("COL1A1" , "VIT", "STRA6", "SFRP1", "MSLN", "LRRC15", "COL9A1", "CDK1")
VlnPlot(seurat_object, features = c(Fb, "PLVAP"))

VlnPlot(seurat_object, features = Ep)

FeaturePlot(seurat_object, features = c(Fb, "PLVAP"))

FeaturePlot(seurat_object, features = Ep)

# rename identified clusters
seurat_object <- RenameIdents(object = seurat_object, `1` = "Fibroblasts_1")
seurat_object <- RenameIdents(object = seurat_object, `2` = "Fibroblasts_2")
seurat_object <- RenameIdents(object = seurat_object, `4` = "Endothelial")
seurat_object <- RenameIdents(object = seurat_object, `5` = "Stellate")
seurat_object <- RenameIdents(object = seurat_object, `6` = "Fibroblasts_3")
seurat_object <- RenameIdents(object = seurat_object, `7` = "Fibroblasts_4")
seurat_object <- RenameIdents(object = seurat_object, `12` = "Fibroblasts_5")
seurat_object <- RenameIdents(object = seurat_object, `13` = "Fibroblasts_6")

seurat_object <- RenameIdents(object = seurat_object, `3` = "Epithelial_1")
seurat_object <- RenameIdents(object = seurat_object, `0` = "Epithelial_2")
seurat_object <- RenameIdents(object = seurat_object, `9` = "Epithelial_3")
seurat_object <- RenameIdents(object = seurat_object, `8` = "Epithelial_4")
seurat_object <- RenameIdents(object = seurat_object, `15` = "Epithelial_5")

seurat_object <- RenameIdents(object = seurat_object, `10` = "Premalignant")


DimPlot(seurat_object, reduction = "umap", label = T)

seurat_object@meta.data$CellType_cluster <- Idents(seurat_object)
seurat_object@meta.data$CellType <- gsub('\\_.*','',seurat_object@meta.data$CellType_cluster)

unique(seurat_object@meta.data$CellType_cluster)
##  [1] Epithelial_2  Endothelial   Fibroblasts_1 Fibroblasts_2 Epithelial_3 
##  [6] Premalignant  Epithelial_1  Fibroblasts_3 Stellate      11           
## [11] Epithelial_4  Fibroblasts_4 Fibroblasts_5 14            Fibroblasts_6
## [16] Epithelial_5 
## 16 Levels: Premalignant Epithelial_5 Epithelial_4 Epithelial_3 ... 14
unique(seurat_object@meta.data$CellType)
## [1] "Epithelial"   "Endothelial"  "Fibroblasts"  "Premalignant" "Stellate"    
## [6] "11"           "14"
seurat_object
## An object of class Seurat 
## 24192 features across 29180 samples within 1 assay 
## Active assay: RNA (24192 features, 2000 variable features)
##  11 layers present: counts.GSM5831620_5_GEX_4, counts.GSM5831621_5_GEX_5, counts.GSM5831622_5_GEX_6, counts.GSM5831623_5_GEX_9, counts.GSM5831624_GEX_45_MM, data.GSM5831620_5_GEX_4, data.GSM5831621_5_GEX_5, data.GSM5831622_5_GEX_6, data.GSM5831623_5_GEX_9, data.GSM5831624_GEX_45_MM, scale.data
##  3 dimensional reductions calculated: pca, umap, integrated.cca
head(seurat_object)
orig.ident nCount_RNA nFeature_RNA cell_id percent.mt is.doublet RNA_snn_res.0.4 seurat_clusters RNA_snn_res.0.3 CellType_cluster CellType
GSM5831620_5_GEX_4_AAACCTGAGCAACGGT-1 GSM5831620_5_GEX_4 15275 3787 GSM5831620_5_GEX_4_AAACCTGAGCAACGGT-1 1.8788871 singlet 8 0 0 Epithelial_2 Epithelial
GSM5831620_5_GEX_4_AAACCTGAGCACCGTC-1 GSM5831620_5_GEX_4 2462 1522 GSM5831620_5_GEX_4_AAACCTGAGCACCGTC-1 3.8180341 singlet 3 4 4 Endothelial Endothelial
GSM5831620_5_GEX_4_AAACCTGAGTCAAGGC-1 GSM5831620_5_GEX_4 2170 985 GSM5831620_5_GEX_4_AAACCTGAGTCAAGGC-1 1.7972350 singlet 10 1 1 Fibroblasts_1 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGAGTCTCGGC-1 GSM5831620_5_GEX_4 2757 1239 GSM5831620_5_GEX_4_AAACCTGAGTCTCGGC-1 7.1091766 singlet 2 2 2 Fibroblasts_2 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCAAAGAATC-1 GSM5831620_5_GEX_4 17419 4277 GSM5831620_5_GEX_4_AAACCTGCAAAGAATC-1 0.7922384 singlet 1 1 1 Fibroblasts_1 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCAAGAGTCG-1 GSM5831620_5_GEX_4 11342 3304 GSM5831620_5_GEX_4_AAACCTGCAAGAGTCG-1 0.8993123 singlet 3 4 4 Endothelial Endothelial
GSM5831620_5_GEX_4_AAACCTGCAATCTGCA-1 GSM5831620_5_GEX_4 9774 2783 GSM5831620_5_GEX_4_AAACCTGCAATCTGCA-1 2.0360139 singlet 2 2 2 Fibroblasts_2 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCAATTCCTT-1 GSM5831620_5_GEX_4 21284 4344 GSM5831620_5_GEX_4_AAACCTGCAATTCCTT-1 0.0845706 singlet 2 2 2 Fibroblasts_2 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCATAGACTC-1 GSM5831620_5_GEX_4 4595 1577 GSM5831620_5_GEX_4_AAACCTGCATAGACTC-1 9.4885745 singlet 2 2 2 Fibroblasts_2 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCATCTCGCT-1 GSM5831620_5_GEX_4 2226 1253 GSM5831620_5_GEX_4_AAACCTGCATCTCGCT-1 0.7187781 singlet 8 0 0 Epithelial_2 Epithelial
# save for easier import
saveRDS(seurat_object, file = "./output/GSE194247_preprocessed.rds")
seurat_object <- readRDS("./output/GSE194247_preprocessed.rds")

head(seurat_object)
orig.ident nCount_RNA nFeature_RNA cell_id percent.mt is.doublet RNA_snn_res.0.4 seurat_clusters RNA_snn_res.0.3 CellType_cluster CellType
GSM5831620_5_GEX_4_AAACCTGAGCAACGGT-1 GSM5831620_5_GEX_4 15275 3787 GSM5831620_5_GEX_4_AAACCTGAGCAACGGT-1 1.8788871 singlet 8 0 0 Epithelial_2 Epithelial
GSM5831620_5_GEX_4_AAACCTGAGCACCGTC-1 GSM5831620_5_GEX_4 2462 1522 GSM5831620_5_GEX_4_AAACCTGAGCACCGTC-1 3.8180341 singlet 3 4 4 Endothelial Endothelial
GSM5831620_5_GEX_4_AAACCTGAGTCAAGGC-1 GSM5831620_5_GEX_4 2170 985 GSM5831620_5_GEX_4_AAACCTGAGTCAAGGC-1 1.7972350 singlet 10 1 1 Fibroblasts_1 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGAGTCTCGGC-1 GSM5831620_5_GEX_4 2757 1239 GSM5831620_5_GEX_4_AAACCTGAGTCTCGGC-1 7.1091766 singlet 2 2 2 Fibroblasts_2 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCAAAGAATC-1 GSM5831620_5_GEX_4 17419 4277 GSM5831620_5_GEX_4_AAACCTGCAAAGAATC-1 0.7922384 singlet 1 1 1 Fibroblasts_1 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCAAGAGTCG-1 GSM5831620_5_GEX_4 11342 3304 GSM5831620_5_GEX_4_AAACCTGCAAGAGTCG-1 0.8993123 singlet 3 4 4 Endothelial Endothelial
GSM5831620_5_GEX_4_AAACCTGCAATCTGCA-1 GSM5831620_5_GEX_4 9774 2783 GSM5831620_5_GEX_4_AAACCTGCAATCTGCA-1 2.0360139 singlet 2 2 2 Fibroblasts_2 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCAATTCCTT-1 GSM5831620_5_GEX_4 21284 4344 GSM5831620_5_GEX_4_AAACCTGCAATTCCTT-1 0.0845706 singlet 2 2 2 Fibroblasts_2 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCATAGACTC-1 GSM5831620_5_GEX_4 4595 1577 GSM5831620_5_GEX_4_AAACCTGCATAGACTC-1 9.4885745 singlet 2 2 2 Fibroblasts_2 Fibroblasts
GSM5831620_5_GEX_4_AAACCTGCATCTCGCT-1 GSM5831620_5_GEX_4 2226 1253 GSM5831620_5_GEX_4_AAACCTGCATCTCGCT-1 0.7187781 singlet 8 0 0 Epithelial_2 Epithelial

Fibroblast subset

# subset fibroblasts
Idents(seurat_object) <- "CellType"

# join layers
seurat_object <- JoinLayers(seurat_object)
fibroblasts <- subset(seurat_object, idents = c("Fibroblasts"))
# save for easier import
saveRDS(fibroblasts, file = "./output/GSE194247_fibroblasts.rds")
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Stockholm
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] scDblFinder_1.18.0          SingleCellExperiment_1.26.0
##  [3] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [5] GenomicRanges_1.56.2        GenomeInfoDb_1.40.1        
##  [7] IRanges_2.38.1              S4Vectors_0.42.1           
##  [9] BiocGenerics_0.50.0         MatrixGenerics_1.16.0      
## [11] matrixStats_1.5.0           lubridate_1.9.3            
## [13] forcats_1.0.0               stringr_1.5.1              
## [15] purrr_1.0.4                 readr_2.1.5                
## [17] tidyr_1.3.1                 tibble_3.2.1               
## [19] ggplot2_3.5.1               tidyverse_2.0.0            
## [21] dplyr_1.1.4                 Seurat_5.2.1               
## [23] SeuratObject_5.0.2          sp_2.2-0                   
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22          splines_4.4.1            
##   [3] later_1.4.1               BiocIO_1.14.0            
##   [5] bitops_1.0-9              polyclip_1.10-7          
##   [7] XML_3.99-0.17             fastDummies_1.7.5        
##   [9] lifecycle_1.0.4           edgeR_4.2.2              
##  [11] globals_0.16.3            lattice_0.22-6           
##  [13] MASS_7.3-61               magrittr_2.0.3           
##  [15] limma_3.60.6              plotly_4.10.4            
##  [17] sass_0.4.9                rmarkdown_2.29           
##  [19] jquerylib_0.1.4           yaml_2.3.10              
##  [21] metapod_1.12.0            httpuv_1.6.15            
##  [23] sctransform_0.4.1         spam_2.11-1              
##  [25] spatstat.sparse_3.1-0     reticulate_1.40.0        
##  [27] cowplot_1.1.3             pbapply_1.7-2            
##  [29] RColorBrewer_1.1-3        abind_1.4-8              
##  [31] zlibbioc_1.50.0           Rtsne_0.17               
##  [33] RCurl_1.98-1.16           GenomeInfoDbData_1.2.12  
##  [35] ggrepel_0.9.6             irlba_2.3.5.1            
##  [37] listenv_0.9.1             spatstat.utils_3.1-2     
##  [39] goftest_1.2-3             RSpectra_0.16-2          
##  [41] dqrng_0.4.1               spatstat.random_3.3-2    
##  [43] fitdistrplus_1.2-2        parallelly_1.42.0        
##  [45] DelayedMatrixStats_1.26.0 codetools_0.2-20         
##  [47] DelayedArray_0.30.1       scuttle_1.14.0           
##  [49] tidyselect_1.2.1          UCSC.utils_1.0.0         
##  [51] farver_2.1.2              viridis_0.6.5            
##  [53] ScaledMatrix_1.12.0       spatstat.explore_3.3-4   
##  [55] GenomicAlignments_1.40.0  jsonlite_1.9.0           
##  [57] BiocNeighbors_1.22.0      progressr_0.15.1         
##  [59] scater_1.32.1             ggridges_0.5.6           
##  [61] survival_3.7-0            tools_4.4.1              
##  [63] ica_1.0-3                 Rcpp_1.0.14              
##  [65] glue_1.8.0                gridExtra_2.3            
##  [67] SparseArray_1.4.8         xfun_0.51                
##  [69] withr_3.0.2               fastmap_1.2.0            
##  [71] bluster_1.14.0            digest_0.6.37            
##  [73] rsvd_1.0.5                timechange_0.3.0         
##  [75] R6_2.6.1                  mime_0.12                
##  [77] colorspace_2.1-1          scattermore_1.2          
##  [79] tensor_1.5                spatstat.data_3.1-4      
##  [81] generics_0.1.3            data.table_1.16.4        
##  [83] rtracklayer_1.64.0        httr_1.4.7               
##  [85] htmlwidgets_1.6.4         S4Arrays_1.4.1           
##  [87] uwot_0.2.2                pkgconfig_2.0.3          
##  [89] gtable_0.3.6              lmtest_0.9-40            
##  [91] XVector_0.44.0            htmltools_0.5.8.1        
##  [93] dotCall64_1.2             scales_1.3.0             
##  [95] png_0.1-8                 spatstat.univar_3.1-1    
##  [97] scran_1.32.0              knitr_1.49               
##  [99] rstudioapi_0.17.1         tzdb_0.4.0               
## [101] reshape2_1.4.4            rjson_0.2.23             
## [103] nlme_3.1-166              curl_6.2.1               
## [105] cachem_1.1.0              zoo_1.8-12               
## [107] KernSmooth_2.23-24        vipor_0.4.7              
## [109] parallel_4.4.1            miniUI_0.1.1.1           
## [111] ggrastr_1.0.2             restfulr_0.0.15          
## [113] pillar_1.10.1             grid_4.4.1               
## [115] vctrs_0.6.5               RANN_2.6.2               
## [117] promises_1.3.2            BiocSingular_1.20.0      
## [119] beachmat_2.20.0           xtable_1.8-4             
## [121] cluster_2.1.6             beeswarm_0.4.0           
## [123] evaluate_1.0.3            locfit_1.5-9.11          
## [125] cli_3.6.4                 compiler_4.4.1           
## [127] Rsamtools_2.20.0          rlang_1.1.5              
## [129] crayon_1.5.3              future.apply_1.11.3      
## [131] labeling_0.4.3            ggbeeswarm_0.7.2         
## [133] plyr_1.8.9                stringi_1.8.4            
## [135] viridisLite_0.4.2         deldir_2.0-4             
## [137] BiocParallel_1.38.0       munsell_0.5.1            
## [139] Biostrings_2.72.1         lazyeval_0.2.2           
## [141] spatstat.geom_3.3-5       Matrix_1.7-1             
## [143] RcppHNSW_0.6.0            hms_1.1.3                
## [145] patchwork_1.3.0           sparseMatrixStats_1.16.0 
## [147] future_1.34.0             statmod_1.5.0            
## [149] shiny_1.10.0              ROCR_1.0-11              
## [151] igraph_2.1.4              bslib_0.9.0              
## [153] xgboost_1.7.8.1